/*==============================================================================
FILE NAME: Create_Air_Panel_Week.do
INPUTS: NSR_Permits_Clean.dta, TitleV_Permits_Clean.dta, facility_characteristics.dta, Investigations_Clean.dta, Notice_of_Violation_Clean.dta, Enforcements_Clean.dta, 
OUTPUTS: Air_Panel_Week_Investigated_Clean.dta
CREATED: 21 September 2021
UPDATED: 5 January 2022
==============================================================================*/

//put together RN-by-month panel
clear all

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/

set more off
//1. create a file that includes the RNs of facilities that have a Title V, NSR or both permits
//panel will focus on RNs that have a Title V permit, NSR permit or both
//assumption is that PNs uniquely identify a permit
use "$processed_data/NSR_Permits_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id NSR
count if RN_id==.
drop if RN_id==.
sort RN_id
by RN_id: gen nobs=_N
tab nobs
duplicates drop
drop nobs
sort RN_id
unique RN_id
save "$processed_data/NSR_temp.dta", replace

use "$processed_data/TitleV_Permits_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id TitleV
count if RN_id==.
drop if RN_id==.
sort RN_id
by RN_id: gen nobs=_N
tab nobs
duplicates drop
drop nobs
sort RN_id
unique RN_id
save "$processed_data/TitleV_temp.dta", replace

use "$processed_data/facility_characteristics.dta", clear
//The variables County and Q are identical and all non-missing.
rename RegulatedEntityNo RN
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id County TCEQRegion
sort TCEQRegion
egen region_id=group(TCEQRegion)
label var region_id "numeric identifier for TCEQRegion"
drop TCEQRegion
duplicates drop
//RN uniquely identifies obs at this point
sort RN_id
save "$processed_data/facilities_temp.dta", replace

//how many RNs have NSR &/or Title V permits?
use "$processed_data/TitleV_temp.dta", clear
append using "$processed_data/NSR_temp.dta"
replace NSR=0 if NSR==.
replace TitleV=0 if TitleV==.
sort RN_id
collapse (sum) NSR TitleV, by(RN_id)
tab NSR TitleV, row column
//only 4% of RNs that have an NSR permit also have a Title V permit
//almost 96% of RNs that have a Title V permit also have an NSR permit
sort RN_id
save "$processed_data/TitleV_NSR_temp.dta", replace
//this file contains the RNs of facilities that show up in the Title V &/or NSR data
//this file will form the cross-sectional basis of the panel

erase "$processed_data/TitleV_temp.dta"
erase "$processed_data/NSR_temp.dta"

//2. merge with facilities data to get county and region
//explore match between RNs with NSR &/or Title V permits and facility characteristics file

use "$processed_data/TitleV_NSR_temp.dta", clear
sort RN_id
merge 1:1 RN_id using "$processed_data/facilities_temp.dta"
tab _merge NSR, row column
//96% of RNs with NSR permit find a match in facilities data
tab _merge TitleV, row column
//99.5% of RNs with Title V permit find a match in facilities data
drop if _merge==2
drop _merge
sort RN_id
save "$processed_data/TitleV_NSR_temp.dta", replace

//3. set up scaffolding for RN-by-week panel for RNs with TitleV/NSR permits; 204 months between Jan2003 & Dec2019
use "$processed_data/TitleV_NSR_temp.dta", clear
keep RN_id
duplicates drop
sort RN_id
unique RN_id
//59556 unique RNs show up in TitleV/NSR permits data
//create RN-by-month panel structure for these RNs
gen year=2003
gen week=1
gen wdate=yw(year,week)
format wdate %tw
drop year week
//need to create numeric identifier for RN so we can xtset the data
//drop "RN" and destring
isid RN_id
//build scaffolding
xtset RN_id wdate
tsappend, add(883)
gen date=dofw(wdate)
gen month=month(date)
gen week = week(date)
gen year=year(date)
order year, after(wdate)
order week, after(year)
drop date
sort RN_id
sort RN_id wdate
merge m:1 RN_id using "$processed_data/TitleV_NSR_temp.dta"
drop _merge
order County, after(week)
save "$processed_data/Panel_scaffold_week.dta", replace

//4. merge in investigations
//separate by complaint/noncomplaint investigation first

use "$processed_data/Investigations_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
destring IN, replace
sort IN
gen week = week(InvestigationStartDate)
save "$processed_data/inv_temp_week.dta", replace

use "$processed_data/investigations_with_noe_nov.dta", clear
drop year
sort IN
merge 1:m IN using "$processed_data/inv_temp_week.dta"
keep if _merge==3
//this will exclude 959 investigations that did not merge successfully with the NOV or NOE files
drop _merge
gen wdate=yw(year,week)
//focus on air investigations
keep if Media=="AIR"
//all but 9 investigations with missing values for Media are before 2003
rename investigation air_investigation
keep IN complaint_inv air_investigation RN_id wdate day
duplicates drop
//at this point IN uniquely identifies obs
sort RN_id wdate day
//next step will give us # of air_investigations by RN-day
collapse (sum) air_investigation complaint_inv, by(RN_id wdate day)
//now create days investigated
sort RN_id wdate
gen day_air_inv=0
replace day_air_inv=1 if air_investigation>0
gen day_air_complaint_inv=0
replace day_air_complaint_inv=1 if complaint_inv>0
gen day_air_nocomplaint_inv=0
replace day_air_nocomplaint_inv=1 if air_investigation>complaint_inv
keep RN_id wdate day_air_inv day_air_complaint_inv day_air_nocomplaint_inv
sort RN_id wdate 
collapse (sum) day_air_inv day_air_complaint_inv day_air_nocomplaint_inv, by(RN_id wdate)
label var day_air_inv "# days RN had >=1 air investigation in week"
label var day_air_complaint_inv "# days RN had >=1 complaint air investigation in week"
label var day_air_nocomplaint_inv "# days RN had >=1 non-complaint air investigation in week"
sort RN_id wdate
//restrict to 2003 to 2019
format wdate %tw
keep if wdate>=tw(2003w1) & wdate<=tw(2019w52)
merge 1:1 RN_id wdate using "$processed_data/Panel_scaffold_week.dta"
drop if _merge==1
drop _merge
foreach x of varlist day_air_inv-day_air_nocomplaint_inv{
replace `x'=0 if `x'==.
}
sort RN_id wdate
gen p_air_inv=(day_air_inv>0)
label var p_air_inv "=1 if RN had at least one air investigation in month"
gen p_air_complaint_inv=(day_air_complaint_inv>0)
label var p_air_complaint_inv "=1 if RN had at least one complaint-related air investigation in month"
gen p_air_nocomplaint_inv=(day_air_nocomplaint_inv>0)
label var p_air_nocomplaint_inv "=1 if RN had at least one non-complaint-related air investigation in month"
save "$processed_data/Panel_inv_week.dta", replace

//5. merge in novs
use "$processed_data/Notice_of_Violation_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
//day month year variables in this file reflect timing of investigation not nov
drop day month year
gen day = day(VN_Date)
gen month = month(VN_Date)
gen week = week(VN_Date)
gen year = year(VN_Date)
//focus on air NOVs
keep if violation_air==1
//all but 4 observations that have missing violation_air occur before 2003
keep RN_id VN day week month year
gen wdate=yw(year,week)
format wdate %tw
duplicates drop
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN wdate
duplicates drop
sort RN_id wdate
collapse (count) VN, by(RN_id wdate)
rename VN air_novs
label var air_novs "number of air NOVs received by RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_inv_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace air_novs=0 if air_novs==.
sort RN_id wdate
gen p_air_nov=(air_novs>0)
label var p_air_nov "=1 if RN had at least one NOV in month"
save "$processed_data/Panel_inv_nov_week.dta", replace

//6. merge in noes
use "$processed_data/Enforcements_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
gen week = month(EN_Date)
keep RN_id VN day week month year enforcement_air
gen wdate=yw(year,week)
format wdate %tw
//focus on air NOEs
keep if enforcement_air==1
//of 511 observations with missing information on media (about 3%), 400 occur after 2003
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN wdate
duplicates drop
sort RN_id wdate
collapse (count) VN, by(RN_id wdate)
rename VN air_noes
label var air_noes "number of air NOEs received by RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_inv_nov_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace air_noes=0 if air_noes==.
sort RN_id wdate
gen p_air_noe=(air_noes>0)
label var p_air_noe "=1 if RN had at least one air NOE in month"
save "$processed_data/Panel_inv_nov_noe_week.dta", replace

//6a. merge in NOEs for Category A violations
use "$processed_data/Enforcements_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
gen week = month(EN_Date)
keep RN_id VN day week month year enforcement_air
gen wdate=yw(year,week)
format wdate %tw
//focus on air NOEs
keep if enforcement_air==1
//of 511 observations with missing information on media (about 3%), 400 occur after 2003
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN wdate
duplicates drop
sort VN
merge m:1 VN using "$processed_data/NOE_cat.dta"
keep if _merge==3
keep if NOE_cat==1
sort RN_id wdate
collapse (count) VN, by(RN_id wdate)
rename VN catA_air_noes
label var catA_air_noes "number of Category A air NOEs received by RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_inv_nov_noe_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace catA_air_noes=0 if catA_air_noes==.
sort RN_id wdate
gen p_catA_air_noe=(catA_air_noes>0)
label var p_catA_air_noe "=1 if RN had at least one Category A air NOE in month"
save "$processed_data/Panel_inv_nov_noe_catA_week.dta", replace

//7. merge in complaints
//now create count of complaints in month
use "$processed_data/incidents.dta", clear
drop if IncidentStatus=="REFERRED"
gen RN_id=substr(RegulatedEntity,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename ComplaintIncident CIN
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0
gen year=year(IncidentRecDate)
gen week = week(IncidentRecDate)
gen month=month(IncidentRecDate)
gen wdate=yw(year,week)
format wdate %tw
//restrict to 2003 to 2019
keep if wdate>=tw(2003w1) & wdate<=tw(2019w52)
gen incident=1
keep CIN RN_id wdate incident
duplicates drop
isid CIN
//at this point CIN uniquely identifies observations
sort RN_id wdate
//this will give us # of complaints in the month
collapse (sum) incident, by(RN_id wdate)
label var incident "# of complaint incidents against RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_inv_nov_noe_catA_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace incident=0 if incident==.
gen p_incident=(incident>0)
label var p_incident "=1 if RN had at least one complaint incident in month"
sort RN_id wdate
save "$processed_data/Panel_inv_nov_noe_inc_week.dta", replace
//now create count of air complaints in month
use "$processed_data/incidents.dta", clear
keep if Media=="AIR"
drop if IncidentStatus=="REFERRED"
gen RN_id=substr(RegulatedEntity,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename ComplaintIncident CIN
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0
gen year=year(IncidentRecDate)
gen month=month(IncidentRecDate)
gen week=week(IncidentRecDate)
gen wdate=yw(year,week)
format wdate %tw
//restrict to 2003 to 2019
keep if wdate>=tw(2003w1) & wdate<=tw(2019w52)
gen air_incident=1
keep CIN RN_id wdate air_incident
duplicates drop
isid CIN
//at this point CIN uniquely identifies observations
sort RN_id wdate
//this will give us # of air complaints in the month
collapse (sum) air_incident, by(RN_id wdate)
label var air_incident "# of air complaint incidents against RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_inv_nov_noe_inc_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace air_incident=0 if air_incident==.
gen p_air_incident=(air_incident>0)
label var p_air_incident "=1 if RN had at least one air complaint incident in month"
sort RN_id wdate
rename County county
sort RN_id wdate
order year, after(wdate)
order week, after(year)
order county, after(week)
order TitleV, after(county)
order NSR, after(TitleV)
save "$processed_data/Panel_week.dta", replace

//8. create a variable that identifies RNs never investigated bt 2003 & 2019
use "$processed_data/Panel_week.dta", clear
keep RN_id day_air_inv
sort RN_id
collapse (sum) day_air_inv, by(RN_id)
gen never_air_inv=0
replace never_air_inv=1 if day_air_inv==0
label var never_air_inv "=1 if RN never has air investigation bt 2003 & 2019"
sort RN_id
save "$processed_data/temp_week.dta", replace
use "$processed_data/Panel_week.dta"
merge m:1 RN_id using "$processed_data/temp_week.dta"
drop _merge
order never_air_inv, after(NSR)
save "$processed_data/Panel_week.dta", replace

//9. add in air emission events
use "$processed_data/Emissions_events.dta", clear
gen RN_id=substr(RegulatedEntityNo,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename IncidentNo EEIN
gen temp = date(IncidentStartDate,"MDY")
drop IncidentStartDate 
rename temp IncidentStartDate
format IncidentStartDate %td
replace IncidentStartDate = . if IncidentStartDate < 0
gen year=year(IncidentStartDate)
gen month=month(IncidentStartDate)
gen week=week(IncidentStartDate)
gen quarter = quarter(IncidentStartDate)
gen wdate=yw(year,week)
format wdate %tw
//restrict to 2003 to 2019
keep if wdate>=tw(2003w1) & wdate<=tw(2019w52)
gen ee_incident=1
keep EEIN RN_id wdate ee_incident
duplicates drop
isid EEIN
//at this point EEIN uniquely identifies observations
sort RN_id wdate
//this will give us # of emission events in the month
collapse (sum) ee_incident, by(RN_id wdate)
label var ee_incident "# of emissions events for RN in month"
sort RN_id wdate
merge 1:1 RN_id wdate using "$processed_data/Panel_week.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace ee_incident=0 if ee_incident==.
gen p_ee_incident=(ee_incident>0)
label var p_ee_incident "=1 if RN had at least one emission event incident in month"
sort RN_id wdate
save "$processed_data/Air_Panel_week.dta", replace

erase "$processed_data/Panel_inv_nov_noe_inc.dta"
erase "$processed_data/Panel_inv_nov_noe.dta"
erase "$processed_data/Panel_inv_nov.dta"
erase "$processed_data/Panel_inv.dta"
erase "$processed_data/facilities_temp.dta"
erase "$processed_data/inv_temp.dta"
